## Case Cohort merged with new variables 

rm(list = ls())
library(tidyverse)
library(caret)
library(haven)
library(stringr)
library(survey)
library(splines)
library(rlang)
library(lmtest)
library(sandwich)
library(broom)
library(MASS)
library(matrixStats)
set.seed(02138)

# Set working directories
# setwd("~/Dropbox (Harvard University)/Gov 2001 Rep Paper/R Scripts") 
# setwd("~/Dropbox (Harvard University)/Gov_2001_Rep/R Scripts") 

# Read sampled control group
d <- sampled_control_group <- readRDS("sampled_control_group.RDS") %>% rename(namea = state1ab, nameb = state2ab, statea = state1no, stateb = state2no, strtyr = year)
nrow(sampled_control_group) #905


#variables in MCT 
years <- seq(1918,2001,1)
datasets<- c("polity", "polity2","boix_dem", "cheibub_dem", "prze_dem", "van_dem", "cinc", "major_power", "nuclear_power", "resolve", "alliance_score", "contig")

#  major_a + alliance_gg + cinc_a + cinc_b + (recruit + female_suffrage + milex_quintile + reg_power_rank + e_peaveduc) + (democ_a + democ_b)

polity <- readRDS("polityIII.RDS") %>% rename(name=COUNTRY)

polity2 <- readRDS("polity2.RDS")

cinc <- readRDS("cinc.RDS")%>% rename(stabb=stateabb, styear=year)

major_power <- readRDS("major_power.RDS")  %>% distinct()

major_power_year_averages <- major_power %>% 
  group_by(namea, strtyr) %>% 
  summarise_at(vars(major_power), mean)

nuclear_power <- readRDS("nuclear_power.RDS")  %>% distinct()

resolve <- readRDS("resolve.RDS")  %>% distinct()

years <- seq(1918,2001,1)

joined <- d %>% # 1378
  # left_join(., y = dplyr::select(polity, -c(YEAR, nameb,name)), by = c("namea", "strtyr")) %>%
  # left_join(., y = dplyr::select(polity, -c(YEAR, namea,name)), by = c("nameb", "strtyr")) %>%
  left_join(., y = cinc, by = c("namea" = "stabb", "strtyr" ="styear")) %>%rename(cinca = cinc) %>%
  left_join(., y = cinc, by = c("nameb" = "stabb", "strtyr" ="styear")) %>%rename(cincb = cinc) %>%
  left_join(., y = major_power_year_averages, by = c("namea", "strtyr")) %>%
  left_join(., y = major_power_year_averages, by = c("nameb" = "namea", "strtyr")) %>%
  left_join(.,y = polity2, by=c("namea"= "scode", strtyr = "year")) %>% rename(demo_a=demo) %>%
  left_join(.,y = polity2, by=c("nameb"= "scode", strtyr = "year")) %>% rename(demo_b=demo)

  
  
## DO WE TAKE AN AVERAGE FOR THE START DAY END DAY/YEAR FOR MAJOR POWER SCORES (don't include nuclear in current model specs tho !)
  # left_join(., y = major_power, by = c("namea" = "namea", "strtyr", "strtday","endyear","endday")) %>%
  # left_join(., y = major_power, by = c("nameb" = "namea", "strtyr", "strtday","endyear","endday")) %>%
  # left_join(., y = nuclear_power, by = c("namea" = "namea", "strtyr", "strtmnth", "strtday","endyear","endday","endmnth")) %>%
  # left_join(., y = nuclear_power, by = c("nameb" = "namea", "strtyr", "strtmnth", "strtday","endyear","endday","endmnth")) 
  

sum(as.numeric(is.na(joined$polity2))) #138 missing 
sum(as.numeric(is.na(joined$demo_a))) #138 missing, better than 
sum(as.numeric(is.na(joined$DEMOC.x))) #281
sum(as.numeric(is.na(joined$conttype))) #579 missing 

# Rename variables
joined <- joined %>%rename(autoc_a = AUTOC.x, democ_a = DEMOC.x,  democ_b = DEMOC.y, xrcomp_a = XRCOMP.x, parreg_a = PARREG.x,  major_a= major_power.x,major_b = major_power.y, 
    cinc_a = cinca, cinc_b = cincb)

names(joined)

#post-treatment confounders   (also in MCT)

powers <- readRDS("powers.RDS")
vdem_sub <- readRDS("vdem_subset.RDS")
regime <- readRDS("regime.RDS")
conscription <- readRDS("conscription.RDS")
leadership <- readRDS("leadership.RDS")

joined <- joined %>% 
  left_join(., y = dplyr::select(powers, c(year, country_text,reg_power_rank, reg_power_quintile, milex_rank, milex_quintile, world_power_rank)), 
            by = c("namea" = "country_text", "strtyr" = "year"))%>% 
  left_join(., y = dplyr::select(powers, c(year, country_text,reg_power_rank, reg_power_quintile, milex_rank, milex_quintile, world_power_rank)), 
            by = c("nameb" = "country_text", "strtyr" = "year")) %>% 
  left_join(., y = vdem_sub, 
            by = c("statea" = "COWcode", "strtyr" = "year"))%>% 
  left_join(., y = vdem_sub, 
            by = c("stateb" = "COWcode", "strtyr" = "year")) %>%
  left_join(., y = dplyr::select(regime, - c(lindex, country_code)), 
          by = c("namea" = "country_text", "strtyr" = "year"))%>%
  left_join(., y = dplyr::select(regime, - c(lindex, country_code)), 
          by = c("nameb" = "country_text", "strtyr" = "year")) %>%
  left_join(., y = dplyr::select(conscription, -  country_code), 
            by = c("namea" = "country_text", "strtyr" = "year")) %>%
  left_join(., y = dplyr::select(conscription, -  country_code),
            by = c("nameb" = "country_text", "strtyr" = "year")) 
#  %>% left_join(., y = dplyr::select(leadership, - country_code), 
#            by = c("namea" = "country_text", "strtyr" = "year")) %>%
#  left_join(., y = dplyr::select(leadership, - country_code), 
#           by = c("nameb" = "country_text", "strtyr" = "year"))
  
sum(as.numeric(is.na(joined$female_suffrage.x))) #6 NAs

joined <- joined %>%
rename(reg_power_rank = reg_power_rank.x, 
       reg_power_quintile = reg_power_quintile.x, 
       milex_rank = milex_rank.x,
       milex_quintile = milex_quintile.x,
       world_power_rank = world_power_rank.x,
       female_suffrage = female_suffrage.x,
       fem_house = fem_house.x,
       e_peaveduc = e_peaveduc.x,
       regime_r = regime_r.x,
       recruit = recruit.x)

saveRDS(joined, "control_gp_all_vars.RDS")


